library(RColorBrewer) #to set color palette for heatmap
library(gplots) #to use heatmap.2
library(ggplot2)#to use ggplot
library(vegan) # to make Mantel correlogram, CCA, and so on
library(ape) #to use read.dna function
library(plyr) # to use ddply
library(Mfuzz) # to do soft clustering
library(caret) #for doing the min-max scaling
library(dplyr) #to use tibble to edit data frame
col_RduBu <- colorRampPalette(rev(brewer.pal(11, "RdBu")))(256) #set the color palette for heatmaps
#input data
data_12years <- read.csv("fcm 12 years.csv")
#plot synechococcus on the figure
plot(data_12years$decimalyears,data_12years$SynechococcusMEAN, log="y", type = "b", ylim=c(1e+01,2e+07), col = "purple", xlab = "", ylab = "Abundance (cells/mL)", lty = 1, lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
#add picocyanobaceria data on the plot
lines(data_12years$decimalyears,data_12years$picocyanobacteriaMEAN,log="y", col = "dark green", type = "b", lty = 1, lwd = 1.3)
#add heterotrophic bacteria data on the plot
lines(data_12years$decimalyears,data_12years$heterotrophic.bacteria, log="y", col = "black", type = "b", lty = 1, lwd = 1.3)
#add a legend to the plot and set legend lty
legend("bottomright", legend = c("Synechococcus", "Picocyanobacteria","Heterotrophic Bacteria"), col = c("purple", "dark green","black"), lty = 1, cex = 1)
#### Figure 1B. 2011-2013 Flow cytometric based abundances of Synechococcus and non-phycoerythrin containing picocyanobacteria.
#input data
data_3years <- read.csv("fcm 3 years.csv")
#plot synechococcus on the figure
plot(data_3years$decimalyears,data_3years$SynechococcusMEAN, log = "y", type = "b", ylim=c(1e+01,1e+07), col = "purple", xlab = "", ylab = "Abundance (cells/mL)", lty = 1, lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
#add cyanobacteria data on the plot
lines(data_3years$decimalyears,data_3years$picocyanobacteriaMEAN, log = "y", col = "dark green", type = "b", lty = 1, lwd = 1.3)
#add a legend to the plot and set legend lty
legend("topleft", legend = c("Synechococcus", "picocyanobacteria"), col = c("purple", "dark green"), lty = 1, cex = 1)
#plot heterotrophic bacteria data
plot(data_3years$decimalyears, data_3years$heterotrophic.bacteria, log = "y", type = "b", col = "black", ylim=c(8e+05,9e+06), xlab = "", ylab = "Abundance (cells/mL)", lty = 1, lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
#Add a legend to the plot and set legend lty
legend("topright", legend = c("Bacteria"), col = "black", lty = 1, cex = 1)
#input data
cotu <- read.csv("cotu by tree order.csv", row.names=1) #Cyanobacteria OTU absolute abundance data ranked by phylogenetic tree order
#log transform the data
cotu_log <- log2(cotu+1)
#plot the figure
library(gplots) #to use heatmap.2
heatmap.2(as.matrix(cotu_log), main = "Heatmap of Cyano OTUs (phylogeny)", xlab = "Samples", ylab = "OTUs", cexCol =0.5, cexRow = 0.5, keysize = 1, lhei = c(1,7), key.xlab = "Log2 (absolute abundance+1), cells/mL", notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#add the phylogenetic tree in the Adobe Illustrator
#input data
coli <- read.csv("coligo by tree order.csv", row.names=1)
#log transform the data
coli_log <- log2(coli +1)
#plot the figure
heatmap.2(as.matrix(coli_log), main = "Heatmap of cyano oligotypes (phylogeny)", xlab = "Samples", ylab = "oligotypes", cexCol =0.5, cexRow = 0.5, keysize = 1, lhei = c(1,7), key.xlab = "Log2 (absolute abundance+1), cells/mL", notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#add the phylogenetic tree in the Adobe Illustrator
#input data
sotu <- read.csv("sotu by tree order.csv", row.names=1)
#log transform the data
sotu_log <- log2(sotu +1)
#plot the figure
heatmap.2(as.matrix(sotu_log),main = "Heatmap of SAR11 OTUs (phylogeny)", xlab = "Samples",ylab = "OTUs", cexCol =0.5, cexRow = 0.5,keysize = 1,lhei = c(1,7),key.xlab = "Log2 (absolute abundance+1), cells/mL", notecol="black", density.info="none", trace="none",col=col_RduBu,dendrogram="none",Colv="none",Rowv = "none")
#add the phylogenetic tree in the Adobe Illustrator
#input data
soli <- read.csv("soligo by tree order.csv", row.names=1)
#log transform the data
soli_log <- log2(soli+1)
#plot the figure
heatmap.2(as.matrix(soli_log),main = "Heatmap of SAR11 oligotypes (phylogeny)", xlab = "Samples",ylab = "oligotypes", cexCol =0.5, cexRow = 0.5,keysize = 1,lhei = c(1,7),key.xlab = "Log2 (absolute abundance+1), cells/mL", notecol="black", density.info="none", trace="none",col=col_RduBu,dendrogram="none",Colv="none",Rowv = "none")
#add the phylogenetic tree in the Adobe Illustrator
#input data
##lag BC data were calculated in Excels (cotu_lagBC_calculation.xlsx, coligo_lagBC_calculation.xlsx, sotu_lagBC_calculation.xlsx, soligo_lagBC_calculation.xlsx), then draw figures using R code list below.
cotu_BC <- read.csv("cotu lag BC.csv")
sotu_BC <- read.csv("sotu lag BC.csv")
coli_BC <- read.csv("coligo lag BC.csv")
soli_BC <- read.csv("soligo lag BC.csv")
#format the data and select time lag to be two years
cotu_BC$sd<-as.numeric(cotu_BC$sd)
sotu_BC$sd<-as.numeric(sotu_BC$sd)
soli_BC$sd<-as.numeric(soli_BC$sd)
coli_BC$sd<-as.numeric(coli_BC$sd)
sotu_BC$type<-c("SAR11 OTUs")
cotu_BC$type<-c("Cyano OTUs")
soli_BC$type<-c("SAR11 oligotypes")
coli_BC$type<-c("Cyano oligotypes")
cotu_BC<-cotu_BC[1:104,] #select time lag to be two years
sotu_BC<-sotu_BC[1:104,]
soli_BC<-soli_BC[1:104,]
coli_BC<-coli_BC[1:104,]
BC_otu<-rbind(sotu_BC,cotu_BC)
BC_oligo<-rbind(soli_BC,coli_BC)
BC<-rbind(BC_otu,BC_oligo) #data is ready for plotting
#plot the figure
ggplot(BC,aes(BC$ï..deltaweeks,average,group=type))+ geom_line(aes(shape=type,color=type))+
geom_point(aes(shape=type,color=type))+
scale_shape_manual(values = c(2,15,2,15)) +
scale_size_manual(values=c(10,10,10,10))+
scale_colour_manual(values= c("dark green","dark green","black","black"))+
scale_linetype_manual(values=c("Cyano OTUs" = "solid","Cyano oligotypes" = "dashed","SAR11 OTUs" = "solid","SAR11 oligotypes" = "dashed"))+
labs(y="Average Bray-Curtis Dissimilarity values", x = "Lag (weeks)",color="")+
xlim(0, 104) +
ylim(0.28, 0.71) +
geom_vline(xintercept = 0, color = "black", size=0.5)+
geom_text(aes(x=9, label="", y=0.56), colour="black", angle=0, vjust = -1, text=element_text(size=11))+
geom_vline(xintercept = 52, color = "black", size=0.5)+ geom_text(aes(x=59, label="", y=0.56), colour="black", angle=0, vjust = -1, text=element_text(size=11))+
geom_vline(xintercept = 104, color = "black", size=0.5)+ geom_text(aes(x=111, label="", y=0.56), colour="black", angle=0, vjust = -1, text=element_text(size=11))+
theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(),axis.line = element_line(colour = "black")
)+
labs(color = "Guide name", linetype = "Guide name", shape = "Guide name")
#Run Soft clustering
#input data
coli_part <- read.csv("161coligo by tree order.csv",header=TRUE,row.names=1)
soli_part <- read.csv("304soligo by tree order.csv",header=TRUE,row.names=1)
#Log2 transform
coli_part_log <- log2(coli_part+1)
soli_part_log <- log2(soli_part+1)
# transform the data via min-max scaling method to let each row range from 0 to 1
process_coli_part_log<- preProcess(t(coli_part_log), method=c("range"))
process_soli_part_log<- preProcess(t(soli_part_log), method=c("range"))
norm_scale_coli_part <- predict(process_coli_part_log, t(coli_part_log)) #each otu now is from 0-1
norm_scale_soli_part <- predict(process_soli_part_log, t(soli_part_log)) #each otu now is from 0-1
coli_part_final<-t(norm_scale_coli_part)
soli_part_final<-t(norm_scale_soli_part)
#format the table to expression set
write.table(coli_part_final, file = "coli_data_final.txt", sep = "\t", row.names = TRUE, col.names = NA)
write.table(soli_part_final, file = "soli_data_final.txt", sep = "\t", row.names = TRUE, col.names = NA)
coli_mfuzz <- table2eset(file='coli_data_final.txt')
soli_mfuzz <- table2eset(file='soli_data_final.txt')
#standardise
coli_mfuzz <- standardise(coli_mfuzz)
soli_mfuzz <- standardise(soli_mfuzz)
#### parameter selection6 fill.NA
#for fuzzifier m, we could use Mestimate
m_coli <- mestimate(coli_mfuzz)
m_soli <- mestimate(soli_mfuzz)
#A fuzzifier of 1 is essentially hard clustering.
#Dmin(coli_mfuzz, m=m_coli, crange=seq(2,42,1), repeats=3, visu=TRUE)
#Dmin(soli_mfuzz, m=m_soli, crange=seq(2,42,1), repeats=3, visu=TRUE)
#based on the result, group data into 3 clusters
clust=3
coli_cluster <- mfuzz(coli_mfuzz, c=clust,m=m_coli)
soli_cluster <- mfuzz(soli_mfuzz, c=clust,m=m_soli)
#centers:soli_cluster[1] ; the cluster assignments: soli_cluster[3] ; and the membership scores:soli_cluster[4]
#extracts cluster numbers and membership values
acore_coli <- acore(coli_mfuzz,coli_cluster,min.acore=0)
acore_soli <- acore(soli_mfuzz,soli_cluster,min.acore=0)
acore_list_coli <- do.call(rbind, lapply(seq_along(acore_coli), function(i){ data.frame(CLUSTER=i, acore_coli[[i]])}))
acore_list_soli <- do.call(rbind, lapply(seq_along(acore_soli), function(i){ data.frame(CLUSTER=i, acore_soli[[i]])}))
#export the above cluster information, then calculate the Centroid values in the Excels: "161coligo cluster centorid cal.xlsx" and 304soligo cluster centorid cal.xlsx"
#input centroid data
coli_centroid <- read.csv("161coligo cluster centroid.csv")
colnames(coli_centroid)<-c("no","Date","Cluster 1","Cluster 2","Cluster 3")
soli_centroid <- read.csv("304soligo cluster centroid.csv")
colnames(soli_centroid)<-c("no","Date","Cluster 1","Cluster 2","Cluster 3")
#generate centroid figure
#Cyanobacteria
ggplot(coli_centroid, aes(x=no)) +
geom_line(aes(y = coli_centroid$`Cluster 1`,color="Cluster 1"),size=1,linetype ="dotted") +
geom_line(aes(y = coli_centroid$`Cluster 2`,color="Cluster 2"),size=1,linetype = "dashed")+
geom_line(aes(y = coli_centroid$`Cluster 3`,color="Cluster 3"),size=1)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"))+
labs(y=" Cyano oligotypes Mfuzz cluster centroid of log2(abundance+1)", x = "Date",color="")+
scale_colour_manual(values = c("#0072B2","#D55E00","#009E73"))
#SAR11
ggplot(soli_centroid, aes(x=no)) +
geom_line(aes(y = soli_centroid$`Cluster 1`,color="Cluster 1"),size=1,linetype ="dotted") +
geom_line(aes(y = soli_centroid$`Cluster 2`,color="Cluster 2"),size=1,linetype = "dashed")+
geom_line(aes(y = soli_centroid$`Cluster 3`,color="Cluster 3"),size=1)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"))+
labs(y=" SAR11 oligotypes Mfuzz cluster centroid of log2(abundance+1)", x = "Date",color="")+
scale_colour_manual(values = c("#0072B2","#D55E00","#009E73"))
# Figures were edited in the Adobe Illustrator
##data by tree order after log and min-max scaling: coli_part_final & soli_part_final
#plot the heatmap of cyanobacteria 161 oligotypes
heatmap.2(as.matrix(coli_part_final), main = "Heatmap of Cyanobateria Oligotypes by tree order", xlab = "Samples",ylab = "Oligotypes",cexCol =0.6,cexRow = 0.8,keysize = 1,lhei = c(1,7),key.xlab = "min-max scaling [log2(abs abundance+1)]",notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#plot the heatmap of SAR11 304 oligotypes
heatmap.2(as.matrix(soli_part_final),main = "Heatmap of SAR11 Oligotypes by tree order",xlab = "Samples",ylab = "Oligotypes",cexCol =0.6,cexRow = 0.8,keysize = 1,lhei = c(1,7),key.xlab = "min-max scaling [log2(abs abundance+1)]",notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#clusters information and phylogenetic trees were added in the Adobe Illustrator
#Three Flow cytometric based abundances plots in Figure S1 were the same ones in Figure 1.See Figure 1 code.
#rank order line charts were plotted in the Excels: cotu_by_rankorder.xlsx, coligo_by_rankorder.xlsx, sotu_by_rankorder.xlsx, soligo_by_rankorder.xlsx
#input cyanobacteria data:cotu_log, coli_log
#input SAR11 data:sotu_log, soli_log
#order the rows by the sum values of each row, from highest to lowest
cotu_log_order<-cotu_log[order(rowSums(cotu_log),decreasing=T),]
coli_log_order<-coli_log[order(rowSums(coli_log),decreasing=T),]
sotu_log_order<-sotu_log[order(rowSums(sotu_log),decreasing=T),]
soli_log_order<-soli_log[order(rowSums(soli_log),decreasing=T),]
#plot heatmaps
heatmap.2(as.matrix(cotu_log_order),main = "Heatmap of Cyano OTUs (rankorder)", cexCol =0.5,cexRow = 0.5,keysize = 1,lhei = c(1,7), key.xlab = "Log2 (abs abundance+1)",notecol="black",density.info="none",trace="none", col=col_RduBu,dendrogram="none",Colv="none", Rowv = "none")
heatmap.2(as.matrix(coli_log_order),main = "Heatmap of Cyano Oligotypes (rankorder)", cexCol =0.5,cexRow = 0.5,keysize = 1,lhei = c(1,7), key.xlab = "Log2 (abs abundance+1)",notecol="black",density.info="none",trace="none", col=col_RduBu,dendrogram="none",Colv="none", Rowv = "none")
heatmap.2(as.matrix(sotu_log_order), main = "Heatmap of SAR11 OTUs (rankorder)", cexCol =0.5,cexRow = 0.5,keysize = 1,lhei = c(1,7), key.xlab = "Log2 (abs abundance+1)",notecol="black",density.info="none",trace="none", col=col_RduBu,dendrogram="none",Colv="none", Rowv = "none")
heatmap.2(as.matrix(soli_log_order), main = "Heatmap of SAR11 Oligotypes (rankorder)", cexCol =0.5,cexRow = 0.5,keysize = 1,lhei = c(1,7), key.xlab = "Log2 (abs abundance+1)",notecol="black",density.info="none",trace="none", col=col_RduBu,dendrogram="none",Colv="none", Rowv = "none")
#get data ready: cyanobacteria OTUs absolute abundance tables: cotu
cotu_2 <- t(cotu)
cotu_3 <- tibble::rownames_to_column(as.data.frame(cotu_2), "sites")
#calculate shannon diversity
COTUshanno<-ddply(cotu_3,~cotu_3$sites,function(x) {
data.frame(SHANNON=diversity(x[-1], index="shannon"))
})
COTUshanno_2 <- tibble::rownames_to_column(as.data.frame(COTUshanno), "no")
#calculate richness
COTUrich<-ddply(cotu_3,~cotu_3$sites,function(x) {
data.frame(RICHNESS=sum(x[-1]>0))
})
COTUrich_2 <- tibble::rownames_to_column(as.data.frame(COTUrich), "no")
#calculate evenness
COTUeven<-ddply(cotu_3,~cotu_3$sites,function(x) {
data.frame(SIMPSON=diversity(x[-1], index="simpson")/log(sum(x[-1]>0)))
})
COTUeven_2 <- tibble::rownames_to_column(as.data.frame(COTUeven), "no")
#plot figures
#plot the shanno diversity figure
plot(COTUshanno_2$no, COTUshanno_2$SHANNON, type = "b", col = "black",ylim = c(0,3),lty = 1, lwd = 1.3,cex.lab = 1.4, cex.axis = 1.3)
#plot the richness figure
plot(COTUrich_2$no, COTUrich_2$RICHNESS, type = "b", col = "black",ylim = c(0,90),lty = 1, lwd = 1.3, cex.lab = 1.4,cex.axis = 1.3)
#plot the evenness figure
plot(COTUeven_2$no, COTUeven_2$SIMPSON, type = "b", col = "black", ylim = c(0,0.3),lty = 1, lwd = 1.3,cex.lab = 1.4,cex.axis = 1.3)
#add year information in the Adobe Illustrator
#get data ready: SAR11 OTUs absolute abundance tables: sotu
sotu_2 <- t(sotu)
sotu_3 <- tibble::rownames_to_column(as.data.frame(sotu_2), "sites")
SOTUshanno<-ddply(sotu_3,~sotu_3$sites,function(x) {
data.frame(SHANNON=diversity(x[-1], index="shannon"))
})
SOTUshanno_2 <- tibble::rownames_to_column(as.data.frame(SOTUshanno), "no")
SOTUrich<-ddply(sotu_3,~sotu_3$sites,function(x) {
data.frame(RICHNESS=sum(x[-1]>0))
})
SOTUrich_2 <- tibble::rownames_to_column(as.data.frame(SOTUrich), "no")
SOTUeven<-ddply(sotu_3,~sotu_3$sites,function(x) {
data.frame(SIMPSON=diversity(x[-1], index="simpson")/log(sum(x[-1]>0)))
})
SOTUeven_2 <- tibble::rownames_to_column(as.data.frame(SOTUeven), "no")
plot(SOTUshanno_2$no, SOTUshanno_2$SHANNON, type = "b", col = "black",ylim = c(0,3), lty = 1, lwd = 1.3,cex.lab = 1.4,cex.axis = 1.3)
plot(SOTUrich_2$no, SOTUrich_2$RICHNESS, type = "b", col = "black", ylim = c(0,90),lty = 1, lwd = 1.3,cex.lab = 1.4,cex.axis = 1.3)
plot(SOTUeven_2$no, SOTUeven_2$SIMPSON, type = "b", col = "black", ylim = c(0,0.3),lty = 1, lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
#add year information in the Adobe Illustrator
#get data ready: cyanobacteria oligotypes absolute abundance tables: coli
coli_2 <- t(coli)
coli_3 <- tibble::rownames_to_column(as.data.frame(coli_2), "sites")
COLIshanno<-ddply(coli_3,~coli_3$sites,function(x) {
data.frame(SHANNON=diversity(x[-1], index="shannon"))
})
COLIshanno_2 <- tibble::rownames_to_column(as.data.frame(COLIshanno), "no")
COLIrich<-ddply(coli_3,~coli_3$sites,function(x) {
data.frame(RICHNESS=sum(x[-1]>0))
})
COLIrich_2 <- tibble::rownames_to_column(as.data.frame(COLIrich), "no")
COLIeven<-ddply(coli_3,~coli_3$sites,function(x) {
data.frame(SIMPSON=diversity(x[-1], index="simpson")/log(sum(x[-1]>0)))
})
COLIeven_2 <- tibble::rownames_to_column(as.data.frame(COLIeven), "no")
plot(COLIshanno_2$no, COLIshanno_2$SHANNON,type = "b", col = "black", ylim = c(0,5), lty = 1, lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
plot(COLIrich_2$no, COLIrich_2$RICHNESS, type = "b", col = "black", ylim = c(0,200),lty = 1, lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
plot(COLIeven_2$no, COLIeven_2$SIMPSON, type = "b", col = "black", ylim = c(0,0.3), lty = 1,lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
#add year information in the Adobe Illustrator
#get data ready: SAR11 oligotypes absolute abundance tables: soli
soli_2 <- t(soli)
soli_3 <- tibble::rownames_to_column(as.data.frame(soli_2), "sites")
SOLIshanno<-ddply(soli_3,~soli_3$sites,function(x) {
data.frame(SHANNON=diversity(x[-1], index="shannon"))
})
SOLIshanno_2 <- tibble::rownames_to_column(as.data.frame(SOLIshanno), "no")
SOLIrich<-ddply(soli_3,~soli_3$sites,function(x) {
data.frame(RICHNESS=sum(x[-1]>0))
})
SOLIrich_2 <- tibble::rownames_to_column(as.data.frame(SOLIrich), "no")
SOLIeven<-ddply(soli_3,~soli_3$sites,function(x) {
data.frame(SIMPSON=diversity(x[-1], index="simpson")/log(sum(x[-1]>0)))
})
SOLIeven_2 <- tibble::rownames_to_column(as.data.frame(SOLIeven), "no")
plot(SOLIshanno_2$no, SOLIshanno_2$SHANNON, type = "b", col = "black", ylim = c(0,5), lty = 1, lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
plot(SOLIrich_2$no, SOLIrich_2$RICHNESS,type = "b", col = "black", ylim = c(0,450), lty = 1, lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
plot(SOLIeven_2$no, SOLIeven_2$SIMPSON, type = "b", col = "black", ylim = c(0,0.3), lty = 1, lwd = 1.3, cex.lab = 1.4, cex.axis = 1.3)
#add year information in the Adobe Illustrator
#get data ready: combine cyanobacteria and SAR11 oligotypes data together
all_log<-rbind(coli_log, soli_log)
#order the rows by the sum values of each row, from highest to lowest
all_log_order<-all_log[order(rowSums(all_log),decreasing=T),]
#plot the heatmap
heatmap.2(as.matrix(all_log_order), main = "Heatmap of cyanobacteria and SAR11 Oligotypes (rankorder)", cexCol =0.5, cexRow = 0.5, keysize = 1, lhei = c(1,7), key.xlab = "Log2 (abs abundance+1)", notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#the rank order line chart and a column of colors showing whether the oligotype is SAR11 or Cyanobacteria were plotted in the Excel: together-oligo-by rank order.xlsx
#Plots were combined in the Adobe Illustrator
#input data: cotu
cotu_t<-t(cotu)
#prep meta data
meta <- read.csv("environmental data.csv", header = T)
# predictor variables
fixed <- meta[,3:9]
row.names(fixed)<-row.names(cotu_t)
#Canonincal correspondence analysis (CCA)
cotu.cca <- cca(cotu_t ~ Temp + MLLW + Salinity + Chlorophyll.a + NH4 + InsolationProjectedDaily, data=fixed)
#compute variance inflation factors (VIF) to check for redundancy in the predictor variables.
vif.cca(cotu.cca)
## Temp MLLW Salinity
## 4.895899 1.177729 2.434063
## Chlorophyll.a NH4 InsolationProjectedDaily
## 1.971077 1.117302 2.847869
#all of our VIFs are now reasonable.
#visualize
cotu_cca <- summary(cotu.cca)
cotu_cca_pr.site <- data.frame(cotu_cca$sites)[1:2]
cotu_cca_pr.site$name <- rownames(cotu_cca_pr.site)
cotu_cca_pr.site$yearday <- meta$YearDay
cotu_cca_pr.env <- data.frame(cotu_cca$biplot)[1:2]
cotu_cca_pr.env$name <- rownames(cotu_cca_pr.env)
cotu_cca1_varex<-round(summary(cotu.cca)$cont$importance[2,1]*100,2) #Get percentage of variance explained by first axis
cotu_cca2_varex<-round(summary(cotu.cca)$cont$importance[2,2]*100,2) #Get percentage of variance explained by second axis
#CCA plots
ggplot(cotu_cca_pr.site, aes(CCA1, CCA2)) +
geom_point(aes(color = yearday), alpha = 0.4, size = 2) +
scale_color_gradient2(low = "red", mid = "darkblue", high = "yellow", midpoint = mean(cotu_cca_pr.site$yearday))+
ylim(-6, 4) +
xlim(-6, 4) +
geom_vline(xintercept = 0, color = 'gray', size = 0.5) +
geom_hline(yintercept = 0, color = 'gray', size = 0.5) +
coord_fixed()+
geom_segment(data=cotu_cca_pr.env,aes(x=0,xend=CCA1*2.5,y=0,yend=CCA2*2.5),color="black",arrow=arrow(length=unit(0.02,"npc")),) +
geom_text(data=cotu_cca_pr.env,aes(x=CCA1*2.5,y=CCA2*2.5,label=rownames(cotu_cca_pr.env),hjust=0.5*(1-sign(CCA1)),vjust=0.5*(1-sign(CCA2))),color="black") +
theme_bw() +
labs(x=paste0("CCA1 (",cotu_cca1_varex," %)"),
y=paste0("CCA2 (",cotu_cca2_varex," %)"))
#edit figures in the Adobe Illustrator
#input data: coli
coli_t<-t(coli)
#Canonincal correspondence analysis (CCA)
coli.cca <- cca(coli_t ~ Temp + MLLW + Salinity + Chlorophyll.a + NH4 + InsolationProjectedDaily, data=fixed)
#compute variance inflation factors (VIF) to check for redundancy in the predictor variables.
vif.cca(coli.cca)
## Temp MLLW Salinity
## 4.740688 1.187173 2.336848
## Chlorophyll.a NH4 InsolationProjectedDaily
## 2.032931 1.302056 2.820359
#all of our VIFs are now reasonable.
#visualize
coli_cca <- summary(coli.cca)
coli_cca_pr.site <- data.frame(coli_cca$sites)[1:2]
coli_cca_pr.site$name <- rownames(coli_cca_pr.site)
coli_cca_pr.site$yearday <- meta$YearDay
coli_cca_pr.site$CCA2<-coli_cca_pr.site$CCA2*(-1)
coli_cca_pr.env <- data.frame(coli_cca$biplot)[1:2]
coli_cca_pr.env$name <- rownames(coli_cca_pr.env)
coli_cca_pr.env$CCA2<-coli_cca_pr.env$CCA2*(-1)
coli_cca1_varex<-round(summary(coli.cca)$cont$importance[2,1]*100,2)
coli_cca2_varex<-round(summary(coli.cca)$cont$importance[2,2]*100,2)
#CCA plots
ggplot(coli_cca_pr.site, aes(CCA1, CCA2)) +
geom_point(aes(color = yearday), alpha = 0.4, size = 2) +
scale_color_gradient2(low = "red", mid = "darkblue", high = "yellow", midpoint = mean(coli_cca_pr.site$yearday))+
ylim(-6, 4) +
xlim(-6, 4) +
geom_vline(xintercept = 0, color = 'gray', size = 0.5) +
geom_hline(yintercept = 0, color = 'gray', size = 0.5) +
coord_fixed()+
geom_segment(data=coli_cca_pr.env,aes(x=0,xend=CCA1*2.5,y=0,yend=CCA2*2.5),color="black",arrow=arrow(length=unit(0.02,"npc")),) +
geom_text(data=coli_cca_pr.env,aes(x=CCA1*2.5,y=CCA2*2.5,label=rownames(coli_cca_pr.env),hjust=0.5*(1-sign(CCA1)),vjust=0.5*(1-sign(CCA2))),color="black") +
theme_bw() +
labs(x=paste0("CCA1 (",coli_cca1_varex," %)"),
y=paste0("CCA2 (",coli_cca2_varex," %)"))
#edit figures in the Adobe Illustrator
#input data: sotu
sotu_t<-t(sotu)
#Canonincal correspondence analysis (CCA)
sotu.cca <- cca(sotu_t ~ Temp + MLLW + Salinity + Chlorophyll.a + NH4 + InsolationProjectedDaily, data=fixed)
#compute variance inflation factors (VIF) to check for redundancy in the predictor variables.
vif.cca(sotu.cca)
## Temp MLLW Salinity
## 5.640303 1.227417 2.257639
## Chlorophyll.a NH4 InsolationProjectedDaily
## 2.247858 1.195433 2.727376
#all of our VIFs are now reasonable.
#visualize
sotu_cca <- summary(sotu.cca)
sotu_cca_pr.site <- data.frame(sotu_cca$sites)[1:2]
sotu_cca_pr.site$name <- rownames(sotu_cca_pr.site)
sotu_cca_pr.site$yearday <- meta$YearDay
sotu_cca_pr.site$CCA2<-sotu_cca_pr.site$CCA2*(-1)
sotu_cca_pr.site$CCA1<-sotu_cca_pr.site$CCA1*(-1)
sotu_cca_pr.env <- data.frame(sotu_cca$biplot)[1:2]
sotu_cca_pr.env$name <- rownames(sotu_cca_pr.env)
sotu_cca_pr.env$CCA2<-sotu_cca_pr.env$CCA2*(-1)
sotu_cca_pr.env$CCA1<-sotu_cca_pr.env$CCA1*(-1)
sotu_cca1_varex<-round(summary(sotu.cca)$cont$importance[2,1]*100,2)
sotu_cca2_varex<-round(summary(sotu.cca)$cont$importance[2,2]*100,2)
#CCA plots
#library(ggplot2)
ggplot(sotu_cca_pr.site, aes(CCA1, CCA2)) +
geom_point(aes(color = yearday), alpha = 0.4, size = 2) +
scale_color_gradient2(low = "red", mid = "darkblue", high = "yellow", midpoint = mean(sotu_cca_pr.site$yearday))+
ylim(-6, 4) +
xlim(-6, 4) +
geom_vline(xintercept = 0, color = 'gray', size = 0.5) +
geom_hline(yintercept = 0, color = 'gray', size = 0.5) +
coord_fixed()+
geom_segment(data=sotu_cca_pr.env,aes(x=0,xend=CCA1*2.5,y=0,yend=CCA2*2.5),color="black",arrow=arrow(length=unit(0.02,"npc")),) +
geom_text(data=sotu_cca_pr.env,aes(x=CCA1*2.5,y=CCA2*2.5,label=rownames(sotu_cca_pr.env),hjust=0.5*(1-sign(CCA1)),vjust=0.5*(1-sign(CCA2))),color="black") +
theme_bw() +
labs(x=paste0("CCA1 (",sotu_cca1_varex," %)"),
y=paste0("CCA2 (",sotu_cca2_varex," %)"))
#edit figures in the Adobe Illustrator
#input data: soli
soli_t<-t(soli)
#Canonincal correspondence analysis (CCA)
soli.cca <- cca(soli_t ~ Temp + MLLW + Salinity + Chlorophyll.a + NH4 + InsolationProjectedDaily, data=fixed)
#compute variance inflation factors (VIF) to check for redundancy in the predictor variables.
vif.cca(soli.cca)
## Temp MLLW Salinity
## 5.640303 1.227417 2.257639
## Chlorophyll.a NH4 InsolationProjectedDaily
## 2.247858 1.195433 2.727376
#all of our VIFs are now reasonable.
#visualize
soli_cca <- summary(soli.cca)
soli_cca_pr.site <- data.frame(soli_cca$sites)[1:2]
soli_cca_pr.site$name <- rownames(soli_cca_pr.site)
soli_cca_pr.site$yearday <- meta$YearDay
soli_cca_pr.env <- data.frame(soli_cca$biplot)[1:2]
soli_cca_pr.env$name <- rownames(soli_cca_pr.env)
soli_cca1_varex<-round(summary(soli.cca)$cont$importance[2,1]*100,2)
soli_cca2_varex<-round(summary(soli.cca)$cont$importance[2,2]*100,2)
#CCA plots
ggplot(soli_cca_pr.site, aes(CCA1, CCA2)) +
geom_point(aes(color = yearday), alpha = 0.4, size = 2) +
scale_color_gradient2(low = "red", mid = "darkblue", high = "yellow", midpoint = mean(soli_cca_pr.site$yearday))+
ylim(-6, 4) +
xlim(-6, 4) +
geom_vline(xintercept = 0, color = 'gray', size = 0.5) +
geom_hline(yintercept = 0, color = 'gray', size = 0.5) +
coord_fixed()+
geom_segment(data=soli_cca_pr.env,aes(x=0,xend=CCA1*2.5,y=0,yend=CCA2*2.5),color="black",arrow=arrow(length=unit(0.02,"npc")),) +
geom_text(data=soli_cca_pr.env,aes(x=CCA1*2.5,y=CCA2*2.5,label=rownames(soli_cca_pr.env),hjust=0.5*(1-sign(CCA1)),vjust=0.5*(1-sign(CCA2))),color="black") +
theme_bw() +
labs(x=paste0("CCA1 (",soli_cca1_varex," %)"),
y=paste0("CCA2 (",soli_cca2_varex," %)"))
#edit figures in the Adobe Illustrator
#input data: oligotypes ranked by cluster and rank order
coli_cluster_rank <- read.csv("161coligo cluster rank order.csv",header=TRUE,row.names=1)
soli_cluster_rank <- read.csv("304soligo cluster rank order.csv",header=TRUE,row.names=1)
#min-max scaling [log2(absolute abundance+1)]
coli_cluster_rank_log <- log2(coli_cluster_rank+1)
soli_cluster_rank_log <- log2(soli_cluster_rank+1)
process_coli_cluster_rank_log <- preProcess(t(coli_cluster_rank_log), method=c("range"))
process_soli_cluster_rank_log <- preProcess(t(soli_cluster_rank_log), method=c("range"))
norm_coli_cluster_rank_log <- predict(process_coli_cluster_rank_log, t(coli_cluster_rank_log)) #each otu now is from 0-1
norm_soli_cluster_rank_log <- predict(process_soli_cluster_rank_log, t(soli_cluster_rank_log)) #each otu now is from 0-1
coli_cluster_rank_log_2<-t(norm_coli_cluster_rank_log)
soli_cluster_rank_log_2<-t(norm_soli_cluster_rank_log)
heatmap.2(as.matrix(coli_cluster_rank_log_2), main = "Heatmap of Cyanobateria Oligotypes", xlab = "Samples", ylab = "Oligotypes", cexCol =0.6, cexRow = 0.8, keysize = 1, lhei = c(1,7), key.xlab = "min-max scaling [log2(abs abundance+1)]", notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
heatmap.2(as.matrix(soli_cluster_rank_log_2), main = "Heatmap of SAR11 Oligotypes", xlab = "Samples", ylab = "Oligotypes", cexCol =0.6, cexRow = 0.8, keysize = 1, lhei = c(1,7), key.xlab = "min-max scaling [log2(abs abundance+1)]", notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#edit figures in the Adobe Illustrator
#Run Soft clustering
#input data
cotu_part <- read.csv("25cotu by tree order.csv",header=TRUE,row.names=1)
sotu_part <- read.csv("37sotu by tree order.csv",header=TRUE,row.names=1)
#Log2 transform
cotu_part_log <- log2(cotu_part+1)
sotu_part_log <- log2(sotu_part+1)
# transform the data via min-max scaling method to let each row range from 0 to 1
library(caret) #for doing the min-max scaling
process_cotu_part_log<- preProcess(t(cotu_part_log), method=c("range"))
process_sotu_part_log<- preProcess(t(sotu_part_log), method=c("range"))
norm_scale_cotu_part <- predict(process_cotu_part_log, t(cotu_part_log)) #each otu now is from 0-1
norm_scale_sotu_part <- predict(process_sotu_part_log, t(sotu_part_log)) #each otu now is from 0-1
cotu_part_final<-t(norm_scale_cotu_part)
sotu_part_final<-t(norm_scale_sotu_part)
#format the table to expression set
write.table(cotu_part_final, file = "cotu_data_final.txt", sep = "\t", row.names = TRUE, col.names = NA)
write.table(sotu_part_final, file = "sotu_data_final.txt", sep = "\t", row.names = TRUE, col.names = NA)
cotu_mfuzz <- table2eset(file='cotu_data_final.txt')
sotu_mfuzz <- table2eset(file='sotu_data_final.txt')
#standardise
cotu_mfuzz <- standardise(cotu_mfuzz)
sotu_mfuzz <- standardise(sotu_mfuzz)
#### parameter selection6 fill.NA
#for fuzzifier m, we could use Mestimate
m_cotu <- mestimate(cotu_mfuzz)
m_sotu <- mestimate(sotu_mfuzz)
#A fuzzifier of 1 is essentially hard clustering.
#Dmin(cotu_mfuzz, m=m_cotu, crange=seq(2,25,1), repeats=3, visu=TRUE)
#Dmin(sotu_mfuzz, m=m_sotu, crange=seq(2,37,1), repeats=3, visu=TRUE)
#based on the result, group data into 3 clusters
clust=3
cotu_cluster <- mfuzz(cotu_mfuzz, c=clust,m=m_cotu)
sotu_cluster <- mfuzz(sotu_mfuzz, c=clust,m=m_sotu)
#centers:sotu_cluster[1] ; the cluster assignments: sotu_cluster[3] ; and the membership scores:sotu_cluster[4]
#extracts cluster numbers and membership values
acore_cotu <- acore(cotu_mfuzz,cotu_cluster,min.acore=0)
acore_sotu <- acore(sotu_mfuzz,sotu_cluster,min.acore=0)
acore_list_cotu <- do.call(rbind, lapply(seq_along(acore_cotu), function(i){ data.frame(CLUSTER=i, acore_cotu[[i]])}))
acore_list_sotu <- do.call(rbind, lapply(seq_along(acore_sotu), function(i){ data.frame(CLUSTER=i, acore_sotu[[i]])}))
#Save the above cluster data, then calculate the Centroid values in the Excels: "25cotu cluster centorid cal.xlsx" and 37sotu cluster centorid cal.xlsx"
#input centroid data
cotu_centroid <- read.csv("25cotu cluster centroid.csv")
colnames(cotu_centroid)<-c("no","Date","Cluster 1","Cluster 2","Cluster 3")
sotu_centroid <- read.csv("37sotu cluster centroid.csv")
colnames(sotu_centroid)<-c("no","Date","Cluster 1","Cluster 2","Cluster 3")
#generate centroid figures
#Cyanobacteria
ggplot(cotu_centroid, aes(x=no)) +
geom_line(aes(y = cotu_centroid$`Cluster 1`,color="Cluster 1"),size=1,linetype ="dotted") +
geom_line(aes(y = cotu_centroid$`Cluster 2`,color="Cluster 2"),size=1,linetype = "dashed")+
geom_line(aes(y = cotu_centroid$`Cluster 3`,color="Cluster 3"),size=1)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"))+
labs(y=" Cyano otus Mfuzz cluster centroid of log2(abundance+1)", x = "Date",color="")+
scale_colour_manual(values = c("#0072B2","#D55E00","#009E73"))
#SAR11
ggplot(sotu_centroid, aes(x=no)) +
geom_line(aes(y = sotu_centroid$`Cluster 1`,color="Cluster 1"),size=1,linetype ="dotted") +
geom_line(aes(y = sotu_centroid$`Cluster 2`,color="Cluster 2"),size=1,linetype = "dashed")+
geom_line(aes(y = sotu_centroid$`Cluster 3`,color="Cluster 3"),size=1)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"))+
labs(y=" SAR11 otus Mfuzz cluster centroid of log2(abundance+1)", x = "Date",color="")+
scale_colour_manual(values = c("#0072B2","#D55E00","#009E73"))
# Figures were edited in the Adobe Illustrator
##data by tree order after log and min-max scaling: cotu_part_final & sotu_part_final
#plot the heatmap of cyanobacteria 25 OTUs
heatmap.2(as.matrix(cotu_part_final), main = "Heatmap of Cyanobateria OTUs by tree order", xlab = "Samples",ylab = "OTUs",cexCol =0.6,cexRow = 0.8,keysize = 1,lhei = c(1,7),key.xlab = "min-max scaling [log2(abs abundance+1)]",notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#plot the heatmap of SAR11 37 OTUs
heatmap.2(as.matrix(sotu_part_final),main = "Heatmap of SAR11 OTUs by tree order",xlab = "Samples",ylab = "OTUs",cexCol =0.6,cexRow = 0.8,keysize = 1,lhei = c(1,7),key.xlab = "min-max scaling [log2(abs abundance+1)]",notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#clusters information and phylogenetic trees were added in the Adobe Illustrator
#input data: OTUs ranked by cluster and rank order
cotu_cluster_rank <- read.csv("25cotu cluster rank order.csv",header=TRUE,row.names=1)
sotu_cluster_rank <- read.csv("37sotu cluster rank order.csv",header=TRUE,row.names=1)
#min-max scaling [log2(absolute abundance+1)]
cotu_cluster_rank_log <- log2(cotu_cluster_rank+1)
sotu_cluster_rank_log <- log2(sotu_cluster_rank+1)
process_cotu_cluster_rank_log <- preProcess(t(cotu_cluster_rank_log), method=c("range"))
process_sotu_cluster_rank_log <- preProcess(t(sotu_cluster_rank_log), method=c("range"))
norm_cotu_cluster_rank_log <- predict(process_cotu_cluster_rank_log, t(cotu_cluster_rank_log)) #each otu now is from 0-1
norm_sotu_cluster_rank_log <- predict(process_sotu_cluster_rank_log, t(sotu_cluster_rank_log)) #each otu now is from 0-1
cotu_cluster_rank_log_2<-t(norm_cotu_cluster_rank_log)
sotu_cluster_rank_log_2<-t(norm_sotu_cluster_rank_log)
heatmap.2(as.matrix(cotu_cluster_rank_log_2), main = "Heatmap of Cyanobateria OTUs", xlab = "Samples", ylab = "OTUs", cexCol =0.6, cexRow = 0.8, keysize = 1, lhei = c(1,7), key.xlab = "min-max scaling [log2(abs abundance+1)]", notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
heatmap.2(as.matrix(sotu_cluster_rank_log_2), main = "Heatmap of SAR11 OTUs", xlab = "Samples", ylab = "OTUs", cexCol =0.6, cexRow = 0.8, keysize = 1, lhei = c(1,7), key.xlab = "min-max scaling [log2(abs abundance+1)]", notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#edit figures in the Adobe Illustrator
##data by tree order after log and min-max scaling: cotu_part_final & sotu_part_final
#plot the heatmap of cyanobacteria 25 OTUs
heatmap.2(as.matrix(cotu_part_final), main = "Heatmap of Cyanobateria OTUs by tree order", xlab = "Samples",ylab = "OTUs",cexCol =0.6,cexRow = 0.8,keysize = 1,lhei = c(1,7),key.xlab = "min-max scaling [log2(abs abundance+1)]",notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#plot the heatmap of SAR11 37 OTUs
heatmap.2(as.matrix(sotu_part_final),main = "Heatmap of SAR11 OTUs by tree order",xlab = "Samples",ylab = "OTUs",cexCol =0.6,cexRow = 0.8,keysize = 1,lhei = c(1,7),key.xlab = "min-max scaling [log2(abs abundance+1)]",notecol="black", density.info="none", trace="none", col=col_RduBu, dendrogram="none", Colv="none", Rowv = "none")
#clusters information and phylogenetic trees were added in the Adobe Illustrator
#input the representative sequence data of all cyanobacteria oligotypes
coli_seq <- read.dna(file = "coligo aligned rep seq.fasta", format = "fasta")
as.character(coli_seq)
coli_seq_dist<- dist.dna(coli_seq, model = "TN93")
#get the log2 transformed cyanobacteria oligotype absolute abundance data
coli_abun<- read.csv("coligo absolute abun.csv",row.names = 1)
coli_abun2 <- log2(t(coli_abun)+1)
#calculate the correlation matrix of cyanobacteria oligotypes log2 transformed absolute abundance data
coli_correlation <- cor(coli_abun2)
round(coli_correlation ,2)
#Mantel correlogram of cyanobacteria oligotypes
#cyano_mantel <- mantel(coli_correlation, coli_seq_dist, method = "spearman", permutations = 9999, na.rm = TRUE)
cyano_mgram <- mantel.correlog(coli_correlation, coli_seq_dist, XY=NULL, n.class=0, break.pts=NULL,
cutoff=TRUE, r.type="spearman", nperm=999, mult="holm", progressive=TRUE)
plot(cyano_mgram, alpha=0.05)
#same process to SAR11 oligotypes to get their mantel correlogram
soli_seq <- read.dna(file = "soligo aligned rep seq.fasta", format = "fasta")
as.character(soli_seq)
soli_seq_dist<- dist.dna(soli_seq, model = "TN93")
soli_abun<- read.csv("soligo absolute abun.csv",row.names = 1)
soli_abun2 <- log2(t(soli_abun)+1)
soli_correlation <- cor(soli_abun2)
round(soli_correlation ,2)
sar11_mgram <- mantel.correlog(soli_correlation, soli_seq_dist, XY=NULL, n.class=0, break.pts=NULL,
cutoff=TRUE, r.type="spearman", nperm=999, mult="holm", progressive=TRUE)
plot(sar11_mgram, alpha=0.05)